R Schnittstellen - Dritter Teil

Jan-Philipp Kolb

9 Mai 2017

Internetresourcen und Schnittstellen nutzen

JavaScript Object Notation

GeoJSON

OpenStreetMap Daten

Import von JSON-Objekten und XML Dateien

Import von JSON Dateien

JavaScript Object Notation (JSON)

Das GeoJSON Format

Die Struktur der Daten kann man sich mit einem JSON Viewer anschauen

Download von Beispieldaten

https://overpass-turbo.eu/

Exkurs OpenStreetMap Daten

Beispiele für GeoJSON

JSON importieren

library("jsonlite")
## Warning: package 'jsonlite' was built under R version 3.3.3
DRINKWATER <- fromJSON("data/RomDrinkingWater.geojson")
names(DRINKWATER)[1:3]
## [1] "type"      "generator" "copyright"
names(DRINKWATER)[4:5]
## [1] "timestamp" "features"

Die Daten anschauen

head(DRINKWATER$features)
##      type             id properties.@id properties.amenity properties.flow
## 1 Feature node/246574149 node/246574149     drinking_water     push-button
## 2 Feature node/246574150 node/246574150     drinking_water            <NA>
## 3 Feature node/246574151 node/246574151     drinking_water            <NA>
## 4 Feature node/248743324 node/248743324     drinking_water            <NA>
## 5 Feature node/251773348 node/251773348     drinking_water            <NA>
## 6 Feature node/251773551 node/251773551     drinking_water            <NA>
##   properties.type properties.name properties.name:fr properties.wheelchair
## 1          nasone            <NA>               <NA>                  <NA>
## 2            <NA>            <NA>               <NA>                  <NA>
## 3            <NA>            <NA>               <NA>                  <NA>
## 4            <NA>            <NA>               <NA>                  <NA>
## 5          nasone            <NA>               <NA>                  <NA>
## 6            <NA>    Acqua Marcia        Eau potable                   yes
##   properties.created_by properties.indoor geometry.type
## 1                  <NA>              <NA>         Point
## 2                  <NA>              <NA>         Point
## 3                  <NA>              <NA>         Point
## 4                  <NA>              <NA>         Point
## 5                  <NA>              <NA>         Point
## 6                  <NA>              <NA>         Point
##   geometry.coordinates
## 1   12.49191, 41.89479
## 2   12.49095, 41.89489
## 3   12.48774, 41.89450
## 4   12.48773, 41.89354
## 5   12.48529, 41.88539
## 6   12.48386, 41.89332

Github JSON Daten

my_repos <- fromJSON("https://api.github.com/users/japhilko/repos")
names(my_repos)
##  [1] "id"                "name"              "full_name"        
##  [4] "owner"             "private"           "html_url"         
##  [7] "description"       "fork"              "url"              
## [10] "forks_url"         "keys_url"          "collaborators_url"
## [13] "teams_url"         "hooks_url"         "issue_events_url" 
## [16] "events_url"        "assignees_url"     "branches_url"     
## [19] "tags_url"          "blobs_url"         "git_tags_url"     
## [22] "git_refs_url"      "trees_url"         "statuses_url"     
## [25] "languages_url"     "stargazers_url"    "contributors_url" 
## [28] "subscribers_url"   "subscription_url"  "commits_url"      
## [31] "git_commits_url"   "comments_url"      "issue_comment_url"
## [34] "contents_url"      "compare_url"       "merges_url"       
## [37] "archive_url"       "downloads_url"     "issues_url"       
## [40] "pulls_url"         "milestones_url"    "notifications_url"
## [43] "labels_url"        "releases_url"      "deployments_url"  
## [46] "created_at"        "updated_at"        "pushed_at"        
## [49] "git_url"           "ssh_url"           "clone_url"        
## [52] "svn_url"           "homepage"          "size"             
## [55] "stargazers_count"  "watchers_count"    "language"         
## [58] "has_issues"        "has_projects"      "has_downloads"    
## [61] "has_wiki"          "has_pages"         "forks_count"      
## [64] "mirror_url"        "open_issues_count" "forks"            
## [67] "open_issues"       "watchers"          "default_branch"

Weiteres Beispiel für JSON Daten

Ergast Daten lesen

library(jsonlite)
res <- fromJSON('http://ergast.com/api/f1/2004/1/results.json')
drivers <- res$MRData$RaceTable$Races$Results[[1]]$Driver
colnames(drivers)
## [1] "driverId"        "code"            "url"             "givenName"      
## [5] "familyName"      "dateOfBirth"     "nationality"     "permanentNumber"

Daten der New York Times

New York Times Beispiel

article_key <- "&api-key=c2fede7bd9aea57c898f538e5ec0a1ee:6:68700045"
url <- "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=obamacare+socialism"
req <- fromJSON(paste0(url, article_key))
articles <- req$response$docs
colnames(articles)
##  [1] "web_url"           "snippet"           "lead_paragraph"   
##  [4] "abstract"          "print_page"        "blog"             
##  [7] "source"            "multimedia"        "headline"         
## [10] "keywords"          "pub_date"          "document_type"    
## [13] "news_desk"         "section_name"      "subsection_name"  
## [16] "byline"            "type_of_material"  "_id"              
## [19] "word_count"        "slideshow_credits"

Import von XML Dateien

Das XML Paket

library(XML)
citation("XML")
## 
## To cite package 'XML' in publications use:
## 
##   Duncan Temple Lang and the CRAN Team (2016). XML: Tools for
##   Parsing and Generating XML Within R and S-Plus. R package
##   version 3.98-1.5. https://CRAN.R-project.org/package=XML
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {XML: Tools for Parsing and Generating XML Within R and S-Plus},
##     author = {Duncan Temple Lang and the CRAN Team},
##     year = {2016},
##     note = {R package version 3.98-1.5},
##     url = {https://CRAN.R-project.org/package=XML},
##   }
## 
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.

Erstes Beispiel

url <- "http://api.openstreetmap.org/api/0.6/
relation/62422"
library(xml2)
BE <- xmlParse(url)
Administrative Grenzen Berlin

Administrative Grenzen Berlin

Das XML analysieren

xmltop = xmlRoot(BE)
class(xmltop)
## [1] "XMLInternalElementNode" "XMLInternalNode"       
## [3] "XMLAbstractNode"
xmlSize(xmltop)
## [1] 1
xmlSize(xmltop[[1]])
## [1] 328

Nutzung von Xpath

Xpath, the XML Path Language, is a query language for selecting nodes from an XML document.

xpathApply(BE,"//tag[@k = 'source:population']")
## [[1]]
## <tag k="source:population" v="http://www.statistik-berlin-brandenburg.de/Publikationen/Stat_Berichte/2010/SB_A1-1_A2-4_q01-10_BE.pdf 2010-10-01"/> 
## 
## attr(,"class")
## [1] "XMLNodeSet"

Node parsen

url2 <- "http://api.openstreetmap.org/api/0.6/node/2923760808"
RennesBa <- xmlParse(url2)

Way parsen

url3 <- "http://api.openstreetmap.org/api/0.6/way/72799743"
MadCalle <- xmlParse(url3)

Mehr Beispiele, wie man mit XML Daten umgeht:

http://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf

http://www.omegahat.net/RSXML/shortIntro.pdf

Noch mehr Informationen

http://www.di.fc.ul.pt/~jpn/r/web/index.html#parsing-xml

http://www.w3schools.com/xml/xquery_intro.asp

http://giventhedata.blogspot.de/2012/06/r-and-web-for-beginners-part-ii-xml-in.html

http://gastonsanchez.com/Handling_and_Processing_Strings_in_R.pdf

Referenzen

citation("XML")
## 
## To cite package 'XML' in publications use:
## 
##   Duncan Temple Lang and the CRAN Team (2016). XML: Tools for
##   Parsing and Generating XML Within R and S-Plus. R package
##   version 3.98-1.5. https://CRAN.R-project.org/package=XML
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {XML: Tools for Parsing and Generating XML Within R and S-Plus},
##     author = {Duncan Temple Lang and the CRAN Team},
##     year = {2016},
##     note = {R package version 3.98-1.5},
##     url = {https://CRAN.R-project.org/package=XML},
##   }
## 
## ATTENTION: This citation information has been auto-generated from
## the package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.

Die Pakete rvest und RCurl

Das Paket rvest

library(rvest)
ht <- read_html('https://www.google.co.in/search?q=guitar+repair+workshop')
links <- ht %>% html_nodes(xpath='//h3/a') %>% html_attr('href')
gsub('/url\\?q=','',sapply(strsplit(links[as.vector(grep('url',links))],split='&'),'[',1))
##  [1] "http://theguitarrepairworkshop.com/"                                                                   
##  [2] "http://www.guitarservices.com/"                                                                        
##  [3] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-project.html"
##  [4] "https://www.facebook.com/The-Guitar-Repair-Workshop-847517635259712/"                                  
##  [5] "https://www.taylorguitars.com/dealer/guitar-repair-workshop-ltd"                                       
##  [6] "http://www.laweekly.com/music/10-best-guitar-repair-shops-in-los-angeles-4647166"                      
##  [7] "https://www.justdial.com/Mumbai/Guitar-Repair-Services/nct-10988623"                                   
##  [8] "https://www.justdial.com/Delhi-NCR/Guitar-Repair-Services/nct-10988623"                                
##  [9] "http://guitarworkshopglasgow.com/pages/repairs-1"                                                      
## [10] "http://www.google.co.in/aclk?sa=l"

Webscraping

Notwendige Pakete

install.packages("tidyverse")
library(tidyverse)
library(stringr)
library(forcats)
library(ggmap)
library(rvest)

SCRAPE DATA FROM WIKIPEDIA

html.world_ports <- read_html("https://en.wikipedia.org/wiki/List_of_busiest_container_ports")
df.world_ports <- html_table(html_nodes(html.world_ports, "table")[[2]], fill = TRUE)

Die Daten anschauen

glimpse(df.world_ports)
## Observations: 50
## Variables: 15
## $ Rank     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ Port     <chr> "Shanghai", "Singapore", "Shenzhen", "Ningbo-Zhoushan...
## $ Economy  <chr> "China", "Singapore", "China", "China", "Hong Kong", ...
## $ 2015[1]  <chr> "36,516", "30,922", "24,142", "20,636", "20,073", "19...
## $ 2014[2]  <chr> "35,268", "33,869", "23,798", "19,450", "22,374", "18...
## $ 2013[3]  <chr> "33,617", "32,240", "23,280", "17,351", "22,352", "17...
## $ 2012[4]  <chr> "32,529", "31,649", "22,940", "16,670", "23,117", "17...
## $ 2011[5]  <chr> "31,700", "29,937", "22,570", "14,686", "24,384", "16...
## $ 2010[6]  <chr> "29,069", "28,431", "22,510", "13,144", "23,532", "14...
## $ 2009[7]  <chr> "25,002", "25,866", "18,250", "10,502", "20,983", "11...
## $ 2008[8]  <chr> "27,980", "29,918", "21,414", "11,226", "24,248", "13...
## $ 2007[9]  <chr> "26,150", "27,932", "21,099", "9,349", "23,881", "13,...
## $ 2006[10] <chr> "21,710", "24,792", "18,469", "7,068", "23,539", "12,...
## $ 2005[11] <chr> "18,084", "23,192", "16,197", "5,208", "22,427", "11,...
## $ 2004[12] <chr> "14,557", "21,329", "13,615", "4,006", "21,984", "11,...

Das Paket rvest

library(rvest)
ht <- read_html('https://www.google.co.in/search?q=guitar+repair+workshop')
links <- ht %>% html_nodes(xpath='//h3/a') %>% html_attr('href')
gsub('/url\\?q=','',sapply(strsplit(links[as.vector(grep('url',links))],split='&'),'[',1))
##  [1] "http://theguitarrepairworkshop.com/"                                                                   
##  [2] "http://www.guitarservices.com/"                                                                        
##  [3] "http://www.guitarrepairbench.com/guitar-building-projects/guitar-workshop/guitar-workshop-project.html"
##  [4] "https://www.facebook.com/The-Guitar-Repair-Workshop-847517635259712/"                                  
##  [5] "https://www.taylorguitars.com/dealer/guitar-repair-workshop-ltd"                                       
##  [6] "http://www.laweekly.com/music/10-best-guitar-repair-shops-in-los-angeles-4647166"                      
##  [7] "https://www.justdial.com/Mumbai/Guitar-Repair-Services/nct-10988623"                                   
##  [8] "https://www.justdial.com/Delhi-NCR/Guitar-Repair-Services/nct-10988623"                                
##  [9] "http://guitarworkshopglasgow.com/pages/repairs-1"                                                      
## [10] "http://www.google.co.in/aclk?sa=l"

Use Case - Scraping Wikipedia

Einleitung

Im Folgenden werde ich zeigen, wie man Textinformationen aus Wikipedia herunterladen, verarbeiten und analysieren kann.

install.packages("NLP")
install.packages("tm")
install.packages("FactoMineR")

Die verwendeten Pakete

library("stringi")
library("tm")
library("FactoMineR")

Die Text Daten herunterladen

wiki <- "http://de.wikipedia.org/wiki/"

titles <- c("Zika-Virus", "Influenza-A-Virus_H1N1", 
            "Spanische_Grippe","Influenzavirus",
            "Vogelgrippe_H5N1",
            "Legionellose-Ausbruch_in_Warstein_2013",
            "Legionellose-Ausbruch_in_Jülich_2014")

Das Herunterladen der Seiten

articles <- character(length(titles))

for (i in 1:length(titles)){
    articles[i] <- stri_flatten(
      readLines(stri_paste(wiki, titles[i])), col = " ")
}

docs <- Corpus(VectorSource(articles))

Die Daten vorbereiten

Das Folgende basiert auf einem Blogpost von Norbert Ryciak über die automatische Kategorisierung von Wikipedia-Artikeln.

docs2 <- tm_map(docs, function(x) stri_replace_all_regex(
  x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(
  x, "\t", " "))

Den Text weiterverarbeiten

docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("german"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, tolower)
# docs8 <- tm_map(docs8, PlainTextDocument)
dtm <- DocumentTermMatrix(docs8)  

Principal Component Analysis

dtm2 <- as.matrix(dtm)
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=TRUE)
words <- frequency[frequency>20]
s <- dtm2[1,which(colnames(dtm2) %in% names(words))]

for(i in 2:nrow(dtm2)){
  s <- cbind(s,dtm2[i,which(colnames(dtm2) %in% 
                              names(words))])
} 

colnames(s) <- titles

Ergebnis

PCA(s)

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 125 individuals, described by 7 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

Ergebnis

Das Dendogramm

s0 <- s/apply(s,1,sd)
h <- hclust(dist(t(s0)), method = "ward")

plot(h, labels = titles, sub = "")

Applikationen und Projektverwaltung mit Rstudio und git

Shiny Apps

Der Start

Die erste App

R und Git

Rstudio und git - ein Projekt anlegen

Ein Projekt mit Versionskontrolle

Auswahl Versionskontrolle

Ein Projekt clonen

Der git-Reiter in Rstudio

Aktuelle eigene Änderungen committen

Commands

git commit
git push

http://stackoverflow.com/questions/1125968/force-git-to-overwrite-local-files-on-pull

Problems with disk space

WinDirStat https://support.microsoft.com/de-de/kb/912997 http://www.pcwelt.de/tipps/Update-Dateien-loeschen-8357046.html

Quelle für Pakete

Datensätze Suchfunktion

Hochperfomanter Code

C++ Integration - Überblick über das Paket rcpp

Warum die Integration von c++

Robert Gentleman, in R Programming for Bioinformatics, 2008, about R’s built-in C interfaces:

Since R is not compiled, in some situations its performance can be substantially improved by writing code in a compiled language. There are also reasons not to write code in other languages, and in particular we caution against premature optimization, prototyping in R is often cost effective. And in our experience very few routines need to be implemented in other languages for effiiency reasons. Another substantial reason not to use an implementation in some other language is increased complexity. The use of another language almost always results in higher maintenance costs and less stability. In addition, any extensions or enhancements of the code will require someone that is proficient in both R and the other language.

Warum und wann?

Voraussetzung Compiler

Für Windows, Rtools

  1. http://cran.r-project.org/bin/windows/Rtools/
  2. http://cran.r-project.org/doc/manuals/R-admin.html#The-Windows-toolset

Für Mac, Xcode

  1. http://cran.r-project.org/doc/manuals/R-admin.html#Installing-R-under-_0028Mac_0029-OS-X
  2. http://cran.r-project.org/doc/manuals/R-admin.html#Mac-OS-X

Was wir nutzen werden

Wir werden die folgenden beiden Pakete nutzen:

Rcpp

Einleitung

Das R-Paket CPP

install.packages("Rcpp")
library(Rcpp)
cppFunction('int add(int x, int y, int z) {
  int sum = x + y + z;
  return sum;
}')
# add works like a regular R function
add
add(1, 2, 3)

Rcpp

Tutorial on Rcpp by Hadley Wickham

library(Rcpp)
cppFunction('int add(int x, int y, int z) {
  int sum = x + y + z;
  return sum;
}')
add(1, 2, 3)

Benchmarking

install.packages("microbenchmark")
library(microbenchmark)

Resourcen

Überblick über Möglichkeiten des Parallel Computings - Paket parallel

Integration von Datenbanken

Datenbanken und R

Integration von PostgreSQL mit dem Paket

RPostgreSQL

PostgreSQL

PostgreSQL

PostgreSQL

PostgreSQL installieren

PG admin installieren

Wie bekomme ich Daten in die Datenbank

# install.packages("RPostgreSQL")
library("RPostgreSQL")

Geodaten in die Datenbank migrieren

sudo -u postgres createuser Japhilko
sudo -u postgres createdb -E UTF8 -O Japhilko offlgeoc

Die postgis Erweiterung muss für die Datenbank installiert werden:

CREATE EXTENSION postgis;

Erweiterung hstore

CREATE EXTENSION hstore;
osm2pgsql -s -U postgres -d offlgeoc /home/kolb/Forschung/osmData/data/saarland-latest.osm.pbf 

Datenbank für Geocoding

sudo -u postgres createdb -E UTF8 -O Japhilko offlgeocRLP
CREATE EXTENSION postgis;
osm2pgsql -s -U postgres -d offlgeocRLP -o gazetteer /home/kolb/Forschung/osmData/data/rheinland-pfalz-latest.osm.pbf 

So bekommt man alle administrativen Grenzen:

SELECT name FROM planet_osm_polygon WHERE boundary='administrative'

Zurück zu R

pw <- {"1234"}
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "offlgeocRLP",
                 host = "localhost", port = 5432,
                 user = "postgres", password = pw)
rm(pw) # removes the password

dbExistsTable(con, "planet_osm_polygon")
df_postgres <- dbGetQuery(con, "SELECT name, admin_level FROM planet_osm_polygon WHERE boundary='administrative'")
barplot(table(df_postgres[,2]),col="royalblue")

df_adm8 <- dbGetQuery(con, "SELECT name, admin_level FROM planet_osm_polygon WHERE boundary='administrative' AND admin_level='8'")
library(knitr)
# kable(head(df_adm8))

df_hnr <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point 
WHERE planet_osm_line.name='Nordring' AND planet_osm_line.highway IN ('motorway','trunk','primary')
AND planet_osm_point.name='Ludwigshafen' AND planet_osm_point.place IN ('city', 'town')
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
df_hnr <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point 
WHERE planet_osm_line.name='Nordring' AND planet_osm_point.name='Ludwigshafen' 
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
head(df_hnr)
df_ <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point 
WHERE planet_osm_line.name='Nordring' AND planet_osm_point.name='Ludwigshafen' 
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
head(df_hnr)
colnames(df_)
table(df_$name)

Adress in Sippersfeld

df_sipp <- dbGetQuery(con, "SELECT * FROM planet_osm_line, planet_osm_point 
WHERE planet_osm_line.name='Rechweg' AND planet_osm_point.name='Sippersfeld' 
ORDER BY ST_Distance(planet_osm_line.way, planet_osm_point.way)")
head(df_sipp)

OpenStreetMap und Open Government Data in PostGIS

restnam <- dbGetQuery(con, "SELECT name, COUNT(osm_id) AS anzahl
FROM planet_osm_point
WHERE amenity = 'restaurant'
  AND name <> ''
GROUP BY name
ORDER BY anzahl DESC
LIMIT 10")
head(restnam)

PostgreSQL and Leaflet

install.packages("plot3D")
library(plot3D)
library(RPostgreSQL)

Nutzung von MongoDB in R